package util;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

import net.htmlparser.jericho.Source;


public class HtmlParser {

	public String[] parse(URL url) throws IOException {
		try {
			Source source = new Source(url);
	
			String rendered = source.getRenderer().toString();
			
			String resultString = rendered.replaceAll("[^\\p{ASCII}]", " ");
			//String resultString = rendered.replaceAll("[^\\p{L}]", " ");

			String[] tokens = resultString.split("\\s");
			
			for (int i = 0; i < tokens.length; i++) {
				tokens[i] = tokens[i].toLowerCase().trim();
			}
			
			return tokens;
			
		} catch (FileNotFoundException ex) {
			//Logger.error("Couldn't fild URL: " + url, ex);
			
			return new String[0];
		}
	}

}
